In [11]:
%matplotlib inline
import matplotlib.pyplot as plt

In [9]:
xlow = 0
xhigh = np.pi/2
x = np.linspace(xlow, xhigh, num = 100)
y = np.cos(x)

In [10]:
plt.figure()
plt.plot(x, y)
plt.show()



In [18]:
dim = 100
x_mcmc = np.random.random(dim) * (xhigh - xlow) + xlow
y_mcmc = np.cos(x_mcmc)

In [19]:
plt.figure()
plt.plot(x_mcmc, y_mcmc, 'o')
plt.show()



In [21]:
y_mcmc.mean() * (xhigh - xlow)


Out[21]:
0.99904262914854747

In [ ]:
(xhigh - xlow) * np.sqrt(y_mcmc.

In [1]:
from numpy.random import random

Metropolis-Hastings Algorithm


In [23]:
def sdnorm(z):
    """
    Standard normal pdf (Probability Density Function)
    """
    return np.exp(-z*z/2.)/np.sqrt(2*np.pi)

n = 10000
alpha = 1
x = 0.
vec = []
vec.append(x)
innov = random(n) * 2 * alpha - alpha #random inovation, uniform proposal distribution
for i in xrange(1,n):
    can = x + innov[i] #candidate
    aprob = np.min([1.,sdnorm(can)/sdnorm(x)]) #acceptance probability
    u = random(1)
    if u[0] < aprob:
        x = can
        vec.append(x)

In [24]:
len(vec)


Out[24]:
8066

In [25]:
#plotting the results:
#theoretical curve
x = np.arange(-3,3,.1)
y = sdnorm(x)
plt.subplot(211)
plt.title('Metropolis-Hastings')
plt.plot(vec)
plt.subplot(212)

plt.hist(vec, bins=30,normed=1)
plt.plot(x,y,'ro')
plt.ylabel('Frequency')
plt.xlabel('x')
plt.legend(('PDF','Samples'))
plt.show()


my version of the code


In [41]:
def metropolis_hastings(f, init = 0, nsamples = 1000):
    alpha = 1
    sample_count = 0
    x = init
    vec = []
    vec.append(x)
    while sample_count < nsamples-1:
        can = x + random(1)[0] * 2 * alpha - alpha #candidate
        aprob = np.min([1.,sdnorm(can)/sdnorm(x)]) #acceptance probability
        u = random(1)
        if u[0] < aprob:
            x = can
            vec.append(x)
            sample_count+=1
    return np.array(vec)

In [42]:
vec = metropolis_hastings(sdnorm, nsamples=10000)

In [43]:
len(vec)


Out[43]:
10000

In [44]:
#plotting the results:
#theoretical curve
x = np.arange(-3,3,.1)
y = sdnorm(x)
plt.subplot(211)
plt.title('Metropolis-Hastings')
plt.plot(vec)
plt.subplot(212)

plt.hist(vec, bins=30,normed=1)
plt.plot(x,y,'ro')
plt.ylabel('Frequency')
plt.xlabel('x')
plt.legend(('PDF','Samples'))
plt.show()



In [45]:
def gauss(z):
    """
    Standard normal gaussian
    """
    return np.exp(-z*z/2.)

In [48]:
import scipy.integrate as integrate

In [50]:
integrate.quad(gauss, -np.Inf, np.Inf)


Out[50]:
(2.5066282746309994, 2.5512942192316024e-08)

In [51]:
np.sqrt(2*np.pi)


Out[51]:
2.5066282746310002

In [167]:
def triangle(z):
    if (type(z) is not type([1])) and (type(z) is not type(np.array(1))):
        x = np.array([z])
    else:
        x = np.array(z)
    return np.piecewise(x, [x < 0, (x < 10) & (x > 0), (x >= 10) & (x <= 20), x > 20], [0, lambda x: x, lambda x: - x + 20, 0])

In [168]:
triangle([1])


Out[168]:
array([1])

In [169]:
x = np.arange(-100,100,.1)
plt.figure()
plt.plot(x, triangle(x))
plt.show()



In [170]:
type(np.array([1]))


Out[170]:
numpy.ndarray

In [174]:
integrate.quad(triangle, -np.Inf, np.Inf)


Out[174]:
(100.00000004456021, 8.648601692584634e-07)

In [178]:
vec = metropolis_hastings(lambda f: 1/100*triangle(x), nsamples=10000)

In [181]:
x = np.arange(0,100,.1)
y = 1/100 * triangle(x)
plt.subplot(211)
plt.title('Metropolis-Hastings')
plt.plot(vec)
plt.subplot(212)

plt.hist(vec, bins=30,normed=1)
plt.plot(x,y,'ro')
plt.ylabel('Frequency')
plt.xlabel('x')
plt.xlim((-30,30))
plt.legend(('PDF','Samples'))
plt.show()



In [184]:
import pymc

In [185]:
@pymc.stochastic(dtype=int)
def switchpoint(value=1900, t_l=1851, t_h=1962):
    """The switchpoint for the rate of disaster occurrence."""
    if value > t_h or value < t_l:
        # Invalid values
        return -np.inf
    else:
        # Uniform log-likelihood
        return -np.log(t_h - t_l + 1)

In [189]:
from scipy.stats import rv_discrete
from scipy import stats
normdiscrete = stats.rv_discrete(values=(gridint, np.round(probs, decimals=7)), name='normdiscrete')
n_sample = 500
np.random.seed(87655678)   # fix the seed for replicability
rvs = normdiscrete.rvs(size=n_sample)
rvsnd = rvs
f, l = np.histogram(rvs, bins=gridlimits)
sfreq = np.vstack([gridint, f, probs*n_sample]).T
print sfreq


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-189-b50054185a3f> in <module>()
      1 from scipy.stats import rv_discrete
      2 from scipy import stats
----> 3 normdiscrete = stats.rv_discrete(values=(gridint, np.round(probs, decimals=7)), name='normdiscrete')
      4 n_sample = 500
      5 np.random.seed(87655678)   # fix the seed for replicability

NameError: name 'gridint' is not defined

In [219]:
from scipy import stats
class deterministic_gen(stats.rv_continuous):
     def _pdf(self, x):
        return 1/x

In [220]:
deterministic = deterministic_gen(name="deterministic")

In [222]:
deterministic.cdf(np.arange(1, 3, 0.5))


---------------------------------------------------------------------------
ZeroDivisionError                         Traceback (most recent call last)
<ipython-input-222-48cd819759f9> in <module>()
----> 1 deterministic.cdf(np.arange(1, 3, 0.5))

/Users/schriste/anaconda/lib/python2.7/site-packages/scipy/stats/distributions.pyc in cdf(self, x, *args, **kwds)
   1342         if any(cond):  # call only if at least 1 entry
   1343             goodargs = argsreduce(cond, *((x,)+args))
-> 1344             place(output,cond,self._cdf(*goodargs))
   1345         if output.ndim == 0:
   1346             return output[()]

/Users/schriste/anaconda/lib/python2.7/site-packages/scipy/stats/distributions.pyc in _cdf(self, x, *args)
   1199 
   1200     def _cdf(self, x, *args):
-> 1201         return self.veccdf(x,*args)
   1202 
   1203     def _logcdf(self, x, *args):

/Users/schriste/anaconda/lib/python2.7/site-packages/numpy/lib/function_base.pyc in __call__(self, *args, **kwargs)
   1570             vargs.extend([kwargs[_n] for _n in names])
   1571 
-> 1572         return self._vectorize_call(func=func, args=vargs)
   1573 
   1574     def _get_ufunc_and_otypes(self, func, args):

/Users/schriste/anaconda/lib/python2.7/site-packages/numpy/lib/function_base.pyc in _vectorize_call(self, func, args)
   1636                       for _a in args]
   1637 
-> 1638             outputs = ufunc(*inputs)
   1639 
   1640             if ufunc.nout == 1:

/Users/schriste/anaconda/lib/python2.7/site-packages/scipy/stats/distributions.pyc in _cdf_single_call(self, x, *args)
   1196 
   1197     def _cdf_single_call(self, x, *args):
-> 1198         return integrate.quad(self._pdf, self.a, x, args=args)[0]
   1199 
   1200     def _cdf(self, x, *args):

/Users/schriste/anaconda/lib/python2.7/site-packages/scipy/integrate/quadpack.pyc in quad(func, a, b, args, full_output, epsabs, epsrel, limit, points, weight, wvar, wopts, maxp1, limlst)
    252         args = (args,)
    253     if (weight is None):
--> 254         retval = _quad(func,a,b,args,full_output,epsabs,epsrel,limit,points)
    255     else:
    256         retval = _quad_weight(func,a,b,args,full_output,epsabs,epsrel,limlst,limit,maxp1,weight,wvar,wopts)

/Users/schriste/anaconda/lib/python2.7/site-packages/scipy/integrate/quadpack.pyc in _quad(func, a, b, args, full_output, epsabs, epsrel, limit, points)
    319             return _quadpack._qagse(func,a,b,args,full_output,epsabs,epsrel,limit)
    320         else:
--> 321             return _quadpack._qagie(func,bound,infbounds,args,full_output,epsabs,epsrel,limit)
    322     else:
    323         if infbounds != 0:

<ipython-input-219-9d04ce8ef423> in _pdf(self, x)
      2 class deterministic_gen(stats.rv_continuous):
      3      def _pdf(self, x):
----> 4         return 1/x

ZeroDivisionError: float division by zero

In [223]:
deterministic.pdf(np.arange(-3, 3, 0.5))


-c:4: RuntimeWarning: divide by zero encountered in divide
Out[223]:
array([-0.33333333, -0.4       , -0.5       , -0.66666667, -1.        ,
       -2.        ,         inf,  2.        ,  1.        ,  0.66666667,
        0.5       ,  0.4       ])

In [224]:
deterministic.rvs(size=10)


---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-224-eafdfe3a1ee3> in <module>()
----> 1 deterministic.rvs(size=10)

/Users/schriste/anaconda/lib/python2.7/site-packages/scipy/stats/distributions.pyc in rvs(self, *args, **kwds)
    669             return loc*ones(size, 'd')
    670 
--> 671         vals = self._rvs(*args)
    672         if self._size is not None:
    673             vals = reshape(vals, size)

/Users/schriste/anaconda/lib/python2.7/site-packages/scipy/stats/distributions.pyc in _rvs(self, *args)
   1192         ## Use basic inverse cdf algorithm for RV generation as default.
   1193         U = mtrand.sample(self._size)
-> 1194         Y = self._ppf(U,*args)
   1195         return Y
   1196 

/Users/schriste/anaconda/lib/python2.7/site-packages/scipy/stats/distributions.pyc in _ppf(self, q, *args)
   1211 
   1212     def _ppf(self, q, *args):
-> 1213         return self.vecfunc(q,*args)
   1214 
   1215     def _isf(self, q, *args):

/Users/schriste/anaconda/lib/python2.7/site-packages/numpy/lib/function_base.pyc in __call__(self, *args, **kwargs)
   1570             vargs.extend([kwargs[_n] for _n in names])
   1571 
-> 1572         return self._vectorize_call(func=func, args=vargs)
   1573 
   1574     def _get_ufunc_and_otypes(self, func, args):

/Users/schriste/anaconda/lib/python2.7/site-packages/numpy/lib/function_base.pyc in _vectorize_call(self, func, args)
   1636                       for _a in args]
   1637 
-> 1638             outputs = ufunc(*inputs)
   1639 
   1640             if ufunc.nout == 1:

/Users/schriste/anaconda/lib/python2.7/site-packages/scipy/stats/distributions.pyc in _ppf_single_call(self, q, *args)
   1154 
   1155         return optimize.brentq(self._ppf_to_solve,
-> 1156                                left, right, args=(q,)+args, xtol=self.xtol)
   1157 
   1158     # moment from definition

/Users/schriste/anaconda/lib/python2.7/site-packages/scipy/optimize/zeros.pyc in brentq(f, a, b, args, xtol, rtol, maxiter, full_output, disp)
    413     if rtol < _rtol:
    414         raise ValueError("rtol too small (%g < %g)" % (rtol, _rtol))
--> 415     r = _zeros._brentq(f,a,b,xtol,rtol,maxiter,args,full_output,disp)
    416     return results_c(full_output, r)
    417 

RuntimeError: Failed to converge after 100 iterations.

In [195]:
deterministic?

In [239]:
def getDistribution(data):
    kernel = stats.gaussian_kde(data)
    class rv(stats.rv_continuous):
        def _rvs(self, *x, **y):
            # don't ask me why it's using self._size 
            # nor why I have to cast to int
            return kernel.resample(int(self._size)) 
        def _cdf(self, x):
            return kernel.integrate_box_1d(-numpy.Inf, x)
        def _pdf(self, x):
            return kernel.evaluate(x)
    return rv(name='kdedist')

In [240]:
f = getDistribution(np.random.randn(100))

In [243]:
f.rvs([1])


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-243-d05ef689c9d3> in <module>()
----> 1 f.rvs([1])

/Users/schriste/anaconda/lib/python2.7/site-packages/scipy/stats/distributions.pyc in rvs(self, *args, **kwds)
    669             return loc*ones(size, 'd')
    670 
--> 671         vals = self._rvs(*args)
    672         if self._size is not None:
    673             vals = reshape(vals, size)

<ipython-input-239-8854fe964a63> in _rvs(self, *x, **y)
      5             # don't ask me why it's using self._size
      6             # nor why I have to cast to int
----> 7             return kernel.resample(int(self._size))
      8         def _cdf(self, x):
      9             return kernel.integrate_box_1d(-numpy.Inf, x)

TypeError: int() argument must be a string or a number, not 'NoneType'

In [231]:
def getDistribution(data):
    kernel = stats.gaussian_kde(data)
    class rv(stats.rv_continuous):
        def _cdf(self, x):
            return kernel.integrate_box_1d(-numpy.Inf, x)
        def _pdf(self, x):
            return kernel.evaluate(x)
    return rv(name='kdedist')


Out[231]:
array([ 0.21934769])

In [244]:
kernel = stats.gaussian_kde(np.random.randn(100))

In [245]:
kernel


Out[245]:
<scipy.stats.kde.gaussian_kde at 0x10b0f2a90>

In [246]:
type(kernel)


Out[246]:
scipy.stats.kde.gaussian_kde

In [247]:
kernel.resample(10)


Out[247]:
array([[ 0.73375879, -1.45096952,  0.50167046, -0.27989526, -0.5990068 ,
        -0.35663271, -2.82007892,  0.59557902, -0.71221009,  0.40275279]])

In [282]:
def getDistribution():
    sigma = 1
    mu = 0
    f = lambda x: 1/(sigma * np.sqrt(2 * np.pi)) * np.exp( - (x - mu)**2 / (2 * sigma**2) )
    class rv(stats.rv_continuous):
        def _pdf(self, x):
            return f(x)
        def _cdf(self, x):
            return integrate.quad(f, -np.Inf, x)
    return rv(name='kdedist')

In [283]:
a = getDistribution()

In [288]:
%%timeit
a.rvs(size=100)


1 loops, best of 3: 2.05 s per loop

In [285]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['random', 'f']
`%matplotlib` prevents importing * from pylab and numpy

In [286]:
plt.figure()
plt.hist(a.rvs(size=1000))
plt.show()



In [297]:
integrate.quad(lambda x: x ** 5, 0, 30)


Out[297]:
(121500000.0, 1.3489209749195652e-06)

In [293]:
a = lambda x: x ** 2

In [294]:
a(np.Inf)


Out[294]:
inf

In [ ]:


In [ ]:

Fitting a line


In [162]:
from pymc import stochastic, observed, deterministic, uniform_like, runiform, rnormal, Sampler, Normal, Uniform
from numpy import inf, log, cos,array
import pymc
import numpy.random

true_slope = 1.5
true_intercept = 4
N = 30
true_x = runiform(0,50, N)
true_y = true_slope*true_x + true_intercept

data_y = rnormal(true_y, 2)
data_x = rnormal(true_x, 2)

In [163]:
plt.plot(true_x, true_y, '-')
plt.plot(data_x, data_y, 'o')


Out[163]:
[<matplotlib.lines.Line2D at 0x10c97aad0>]

In [164]:
A = array([ data_x, np.ones(len(data_x))])
lstsq_theta = numpy.linalg.lstsq(A.T,data_y)[0]
print lstsq_theta


[ 1.50028281  3.9907674 ]

In [165]:
@stochastic
def theta(value=array([2.,5.])):
    """Slope and intercept parameters for a straight line.
    The likelihood corresponds to the prior probability of the parameters."""
    slope, intercept = value
    prob_intercept = uniform_like(intercept, -10, 10)
    prob_slope = log(1./cos(slope)**2)
    return prob_intercept+prob_slope

init_x = data_x.clip(min=0, max=50)

# Inferred true inputs.
x = Uniform('x', lower=0, upper=50, value=init_x)

@deterministic(plot=False)
def modelled_y(x=x, theta=theta):
    """Return y computed from the straight line model, given the
    inferred true inputs and the model paramters."""
    slope, intercept = theta
    return slope*x + intercept


"""
Input error model.

    Define the probability of measuring x knowing the true value.
"""
measured_input = Normal('measured_input', mu=x, tau=2, value=data_x, observed=True)

"""
Output error model.
    Define the probability of measuring x knowing the true value.
    In this case, the true value is assumed to be given by the model, but
    structural errors could be integrated to the analysis as well.
"""
y = Normal('y', mu=modelled_y, tau=2, value=data_y, observed=True)

In [166]:
model = pymc.Model([modelled_y, y, x, theta])

mcmc = pymc.MCMC(model)
mcmc.sample(iter=10000, burn=5000, thin=2)


 [-----------------100%-----------------] 10000 of 10000 complete in 1.8 sec

In [175]:
slope_samples = mcmc.trace('theta')[:,0]
intercept_samples = mcmc.trace('theta')[:,1]

In [176]:
mcmc.trace('theta').stats()


Out[176]:
{'95% HPD interval': array([[ 1.48455922,  1.53974621],
        [ 2.80189691,  4.3501949 ]]),
 'mc error': array([ 0.00143399,  0.04088941]),
 'mean': array([ 1.51314587,  3.67836917]),
 'n': 2500,
 'quantiles': {2.5: array([ 1.48709116,  2.83088279]),
  25: array([ 1.50375163,  3.35934765]),
  50: array([ 1.512289  ,  3.65688925]),
  75: array([ 1.52336374,  4.06440852]),
  97.5: array([ 1.54953016,  4.40607657])},
 'standard deviation': array([ 0.01508375,  0.42340166])}

In [177]:
ax = plt.subplot(211)

plt.hist(slope_samples, histtype='stepfilled', bins=30, alpha=0.85,
         label="posterior of slope", color="#A60628", normed=True)
plt.legend(loc="upper left")
plt.xlabel("slope value")
plt.axvline(true_slope)

ax = plt.subplot(212)
plt.hist(intercept_samples, histtype='stepfilled', bins=30, alpha=0.85,
         label="posterior of intercept", color="#7A68A6", normed=True)
plt.legend(loc="upper left")
plt.xlabel("intercept value")
plt.axvline(true_intercept)

plt.show()



In [178]:
slope_samples.mean()


Out[178]:
1.5131458715226436

In [179]:
slope_samples.std()


Out[179]:
0.015083748072932899

In [180]:
intercept_samples.mean()


Out[180]:
3.6783691715218847

In [181]:
intercept_samples.std()


Out[181]:
0.42340166145651037

In [182]:
pymc.Matplot.plot(mcmc)


Plotting theta_0
Plotting theta_1

In [151]:
pymc_theta = .stats()["mean"]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-151-041206880cfc> in <module>()
----> 1 pymc_theta = t.stats()["mean"]

NameError: name 't' is not defined

Fitting a Gaussian


In [183]:
numpy.random.seed(15)

true_mu = 1.5
true_tau = 50.0
N_samples = 500

mu = pymc.Uniform('mu', lower=-100.0, upper=100.0)
tau = pymc.Gamma('tau', alpha=0.1, beta=0.001)

data = pymc.rnormal( true_mu, true_tau, size=(N_samples,) )

y = pymc.Normal('y',mu,tau,value=data,observed=True)

In [189]:
mcmc=pymc.MCMC([mu, tau, data, y])
mcmc.sample(iter=1000, burn=500, thin=2)


 [-----------------100%-----------------] 1000 of 1000 complete in 0.1 sec

In [190]:
print 'mu',np.mean(mcmc.trace('mu')[:])
print 'tau',np.mean(mcmc.trace('tau')[:])


mu 1.50080274111
tau 46.5589988381

In [188]:
pymc.Matplot.plot(mcmc)


Plotting tau
Plotting mu

In [ ]: